import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
# Display floats in pandas output with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format
%%HTML
<script src="require.js"></script>
import plotly.io as pio
# Render plotly figures inline in the classic notebook frontend.
pio.renderers.default='notebook'
# Use a CJK-capable font so non-ASCII labels render; keep the minus sign drawable.
plt.rcParams['font.sans-serif']= ['Microsoft JhengHei']
plt.rcParams['axes.unicode_minus']=False
# Load the raw sales data; the CSV's first column is the original row index.
df = pd.read_csv("Bakery sales.csv",index_col=0)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 234005 entries, 0 to 511395 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 234005 non-null object 1 time 234005 non-null object 2 ticket_number 234005 non-null float64 3 article 234005 non-null object 4 Quantity 234005 non-null float64 5 unit_price 234005 non-null object dtypes: float64(2), object(4) memory usage: 12.5+ MB
# Preview the first rows to inspect the raw formats (e.g. prices like '0,90 €').
df.head()
| date | time | ticket_number | article | Quantity | unit_price | |
|---|---|---|---|---|---|---|
| 0 | 2021-01-02 | 08:38 | 150,040.00 | BAGUETTE | 1.00 | 0,90 € |
| 1 | 2021-01-02 | 08:38 | 150,040.00 | PAIN AU CHOCOLAT | 3.00 | 1,20 € |
| 4 | 2021-01-02 | 09:14 | 150,041.00 | PAIN AU CHOCOLAT | 2.00 | 1,20 € |
| 5 | 2021-01-02 | 09:14 | 150,041.00 | PAIN | 1.00 | 1,15 € |
| 8 | 2021-01-02 | 09:25 | 150,042.00 | TRADITIONAL BAGUETTE | 5.00 | 1,20 € |
# Total missing values across the whole frame (0 below -> no NaNs to handle).
df.isna().values.sum()
0
# Count fully duplicated rows (identical across every column).
df.duplicated().values.sum()
1210
We can see there are many duplicated rows. Since each duplicated pair shares the same ticket number, these duplicates are almost certainly data-entry artifacts rather than genuine separate sales, so we can safely drop them.
# Inspect the duplicates; keep=False marks every member of each duplicate group.
df[df.duplicated(keep=False)].head(30)
| date | time | ticket_number | article | Quantity | unit_price | |
|---|---|---|---|---|---|---|
| 76 | 2021-01-02 | 10:12 | 150,061.00 | COUPE | 1.00 | 0,15 € |
| 78 | 2021-01-02 | 10:12 | 150,061.00 | COUPE | 1.00 | 0,15 € |
| 86 | 2021-01-02 | 10:18 | 150,063.00 | COUPE | 1.00 | 0,15 € |
| 89 | 2021-01-02 | 10:18 | 150,063.00 | COUPE | 1.00 | 0,15 € |
| 446 | 2021-01-02 | 12:23 | 150,154.00 | TRADITIONAL BAGUETTE | 1.00 | 1,20 € |
| 447 | 2021-01-02 | 12:23 | 150,154.00 | TRADITIONAL BAGUETTE | 1.00 | 1,20 € |
| 464 | 2021-01-02 | 12:28 | 150,159.00 | COUPE | 1.00 | 0,15 € |
| 467 | 2021-01-02 | 12:28 | 150,159.00 | COUPE | 1.00 | 0,15 € |
| 653 | 2021-01-02 | 13:17 | 150,206.00 | COUPE | 1.00 | 0,15 € |
| 655 | 2021-01-02 | 13:17 | 150,206.00 | COUPE | 1.00 | 0,15 € |
| 720 | 2021-01-02 | 13:44 | 150,222.00 | COUPE | 1.00 | 0,15 € |
| 722 | 2021-01-02 | 13:44 | 150,222.00 | COUPE | 1.00 | 0,15 € |
| 730 | 2021-01-02 | 13:55 | 150,224.00 | COUPE | 1.00 | 0,15 € |
| 731 | 2021-01-02 | 13:55 | 150,224.00 | COUPE | 1.00 | 0,15 € |
| 2341 | 2021-01-05 | 12:38 | 150,675.00 | COUPE | 1.00 | 0,15 € |
| 2343 | 2021-01-05 | 12:38 | 150,675.00 | COUPE | 1.00 | 0,15 € |
| 3816 | 2021-01-09 | 11:38 | 151,091.00 | COUPE | 1.00 | 0,15 € |
| 3821 | 2021-01-09 | 11:38 | 151,091.00 | COUPE | 1.00 | 0,15 € |
| 3822 | 2021-01-09 | 11:38 | 151,091.00 | COUPE | 1.00 | 0,15 € |
| 4226 | 2021-01-09 | 19:05 | 151,203.00 | COUPE | 1.00 | 0,15 € |
| 4229 | 2021-01-09 | 19:05 | 151,203.00 | COUPE | 1.00 | 0,15 € |
| 4411 | 2021-01-10 | 10:57 | 151,251.00 | GAL FRANGIPANE 6P | 1.00 | 12,00 € |
| 4413 | 2021-01-10 | 10:57 | 151,251.00 | GAL FRANGIPANE 6P | 1.00 | 12,00 € |
| 5485 | 2021-01-12 | 09:52 | 151,550.00 | COUPE | 1.00 | 0,15 € |
| 5488 | 2021-01-12 | 09:52 | 151,550.00 | COUPE | 1.00 | 0,15 € |
| 5761 | 2021-01-12 | 13:41 | 151,628.00 | SANDWICH COMPLET | 1.00 | 4,50 € |
| 5762 | 2021-01-12 | 13:41 | 151,628.00 | SANDWICH COMPLET | 1.00 | 4,50 € |
| 6181 | 2021-01-14 | 13:33 | 151,748.00 | COUPE | 1.00 | 0,15 € |
| 6184 | 2021-01-14 | 13:33 | 151,748.00 | COUPE | 1.00 | 0,15 € |
| 8048 | 2021-01-17 | 13:20 | 152,266.00 | COUPE | 1.00 | 0,15 € |
# Clean the raw data: drop exact duplicate rows, fix dtypes, build a timestamp.
df = df.drop_duplicates()
# Quantities are whole units.
df["Quantity"] = df["Quantity"].astype(int)
# unit_price arrives as text like '0,90 €': strip the euro sign, use '.' decimals.
price_text = df["unit_price"].str.replace(' €', '').str.replace(',', '.')
df["unit_price"] = price_text.astype(float)
# Merge the date and time strings into one datetime, then rename the column.
df["date"] = pd.to_datetime(df["date"] + " " + df["time"])
df = df.rename(columns={"date":"date_time"})
# Show the cleaned frame.
df
| date_time | time | ticket_number | article | Quantity | unit_price | |
|---|---|---|---|---|---|---|
| 0 | 2021-01-02 08:38:00 | 08:38 | 150,040.00 | BAGUETTE | 1 | 0.90 |
| 1 | 2021-01-02 08:38:00 | 08:38 | 150,040.00 | PAIN AU CHOCOLAT | 3 | 1.20 |
| 4 | 2021-01-02 09:14:00 | 09:14 | 150,041.00 | PAIN AU CHOCOLAT | 2 | 1.20 |
| 5 | 2021-01-02 09:14:00 | 09:14 | 150,041.00 | PAIN | 1 | 1.15 |
| 8 | 2021-01-02 09:25:00 | 09:25 | 150,042.00 | TRADITIONAL BAGUETTE | 5 | 1.20 |
| ... | ... | ... | ... | ... | ... | ... |
| 511387 | 2022-09-30 18:52:00 | 18:52 | 288,911.00 | COUPE | 1 | 0.15 |
| 511388 | 2022-09-30 18:52:00 | 18:52 | 288,911.00 | BOULE 200G | 1 | 1.20 |
| 511389 | 2022-09-30 18:52:00 | 18:52 | 288,911.00 | COUPE | 2 | 0.15 |
| 511392 | 2022-09-30 18:55:00 | 18:55 | 288,912.00 | TRADITIONAL BAGUETTE | 1 | 1.30 |
| 511395 | 2022-09-30 18:56:00 | 18:56 | 288,913.00 | TRADITIONAL BAGUETTE | 1 | 1.30 |
232795 rows × 6 columns
# Confirm dtypes after cleaning (date_time is datetime64, unit_price is float).
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 232795 entries, 0 to 511395 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date_time 232795 non-null datetime64[ns] 1 time 232795 non-null object 2 ticket_number 232795 non-null float64 3 article 232795 non-null object 4 Quantity 232795 non-null int32 5 unit_price 232795 non-null float64 dtypes: datetime64[ns](1), float64(2), int32(1), object(2) memory usage: 11.5+ MB
# Summary statistics. NOTE(review): Quantity has a minimum of -200 — negative
# quantities are present, presumably refunds/returns; verify before summing.
df.describe()
| ticket_number | Quantity | unit_price | |
|---|---|---|---|
| count | 232,795.00 | 232,795.00 | 232,795.00 |
| mean | 219,217.35 | 1.54 | 1.67 |
| std | 40,063.04 | 1.29 | 1.72 |
| min | 150,040.00 | -200.00 | 0.00 |
| 25% | 184,743.00 | 1.00 | 1.10 |
| 50% | 218,841.00 | 1.00 | 1.20 |
| 75% | 253,965.50 | 2.00 | 1.50 |
| max | 288,913.00 | 200.00 | 60.00 |
# Derive calendar features from the timestamp for the aggregations below.
dt = df["date_time"].dt
df["Y"] = dt.year
df["M"] = dt.month
df["D"] = dt.day
# dayofweek is 0-based (Monday=0); shift so Monday=1 .. Sunday=7.
df["WD"] = dt.dayofweek + 1
df
| date_time | time | ticket_number | article | Quantity | unit_price | Y | M | D | WD | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-01-02 08:38:00 | 08:38 | 150,040.00 | BAGUETTE | 1 | 0.90 | 2021 | 1 | 2 | 6 |
| 1 | 2021-01-02 08:38:00 | 08:38 | 150,040.00 | PAIN AU CHOCOLAT | 3 | 1.20 | 2021 | 1 | 2 | 6 |
| 4 | 2021-01-02 09:14:00 | 09:14 | 150,041.00 | PAIN AU CHOCOLAT | 2 | 1.20 | 2021 | 1 | 2 | 6 |
| 5 | 2021-01-02 09:14:00 | 09:14 | 150,041.00 | PAIN | 1 | 1.15 | 2021 | 1 | 2 | 6 |
| 8 | 2021-01-02 09:25:00 | 09:25 | 150,042.00 | TRADITIONAL BAGUETTE | 5 | 1.20 | 2021 | 1 | 2 | 6 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 511387 | 2022-09-30 18:52:00 | 18:52 | 288,911.00 | COUPE | 1 | 0.15 | 2022 | 9 | 30 | 5 |
| 511388 | 2022-09-30 18:52:00 | 18:52 | 288,911.00 | BOULE 200G | 1 | 1.20 | 2022 | 9 | 30 | 5 |
| 511389 | 2022-09-30 18:52:00 | 18:52 | 288,911.00 | COUPE | 2 | 0.15 | 2022 | 9 | 30 | 5 |
| 511392 | 2022-09-30 18:55:00 | 18:55 | 288,912.00 | TRADITIONAL BAGUETTE | 1 | 1.30 | 2022 | 9 | 30 | 5 |
| 511395 | 2022-09-30 18:56:00 | 18:56 | 288,913.00 | TRADITIONAL BAGUETTE | 1 | 1.30 | 2022 | 9 | 30 | 5 |
232795 rows × 10 columns
# Revenue of each receipt line: quantity sold times unit price.
df["REV"] = df["unit_price"].mul(df["Quantity"])
df
| date_time | time | ticket_number | article | Quantity | unit_price | Y | M | D | WD | REV | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-01-02 08:38:00 | 08:38 | 150,040.00 | BAGUETTE | 1 | 0.90 | 2021 | 1 | 2 | 6 | 0.90 |
| 1 | 2021-01-02 08:38:00 | 08:38 | 150,040.00 | PAIN AU CHOCOLAT | 3 | 1.20 | 2021 | 1 | 2 | 6 | 3.60 |
| 4 | 2021-01-02 09:14:00 | 09:14 | 150,041.00 | PAIN AU CHOCOLAT | 2 | 1.20 | 2021 | 1 | 2 | 6 | 2.40 |
| 5 | 2021-01-02 09:14:00 | 09:14 | 150,041.00 | PAIN | 1 | 1.15 | 2021 | 1 | 2 | 6 | 1.15 |
| 8 | 2021-01-02 09:25:00 | 09:25 | 150,042.00 | TRADITIONAL BAGUETTE | 5 | 1.20 | 2021 | 1 | 2 | 6 | 6.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 511387 | 2022-09-30 18:52:00 | 18:52 | 288,911.00 | COUPE | 1 | 0.15 | 2022 | 9 | 30 | 5 | 0.15 |
| 511388 | 2022-09-30 18:52:00 | 18:52 | 288,911.00 | BOULE 200G | 1 | 1.20 | 2022 | 9 | 30 | 5 | 1.20 |
| 511389 | 2022-09-30 18:52:00 | 18:52 | 288,911.00 | COUPE | 2 | 0.15 | 2022 | 9 | 30 | 5 | 0.30 |
| 511392 | 2022-09-30 18:55:00 | 18:55 | 288,912.00 | TRADITIONAL BAGUETTE | 1 | 1.30 | 2022 | 9 | 30 | 5 | 1.30 |
| 511395 | 2022-09-30 18:56:00 | 18:56 | 288,913.00 | TRADITIONAL BAGUETTE | 1 | 1.30 | 2022 | 9 | 30 | 5 | 1.30 |
232795 rows × 11 columns
# Aggregate sales volume, receipt-line count and revenue per calendar month.
# NOTE(review): "client_num" counts receipt lines (rows), not unique tickets;
# ticket_number nunique may have been intended — verify against intent.
monthly = df.groupby("M", as_index=False).agg(
    total_sales_volume=("Quantity", "sum"),
    client_num=("ticket_number", "count"),
    total_rev=("REV", "sum"),
)
# Compute monthly averages: the data spans 2021-01 .. 2022-09, so months 1-9
# (rows 0..8) contain two years of data and are halved; Oct-Dec appear once.
# Vectorized division replaces the deprecated elementwise DataFrame.applymap.
value_cols = monthly.columns.difference(["M"])
monthly.loc[:8, value_cols] = monthly.loc[:8, value_cols] / 2
monthly
| M | total_sales_volume | client_num | total_rev | |
|---|---|---|---|---|
| 0 | 1 | 9,188.00 | 6,392.00 | 15,448.76 |
| 1 | 2 | 11,641.50 | 7,766.50 | 17,206.95 |
| 2 | 3 | 12,747.00 | 8,696.50 | 19,695.38 |
| 3 | 4 | 15,692.00 | 10,461.00 | 24,640.67 |
| 4 | 5 | 18,499.00 | 11,987.00 | 29,422.50 |
| 5 | 6 | 15,969.00 | 10,723.00 | 24,920.38 |
| 6 | 7 | 28,661.50 | 17,817.50 | 44,281.71 |
| 7 | 8 | 33,309.00 | 20,117.00 | 51,155.97 |
| 8 | 9 | 14,311.00 | 9,670.00 | 22,256.38 |
| 9 | 10 | 14,924.00 | 9,817.00 | 22,682.05 |
| 10 | 11 | 11,744.00 | 7,913.00 | 18,025.75 |
| 11 | 12 | 12,042.00 | 7,804.00 | 18,999.25 |
This chart shows that July and August are the most important months of the year.
# Line chart comparing the three monthly metrics on one axis.
fig = px.line(monthly,x="M",y=["client_num","total_sales_volume","total_rev"],title="Monthly number of clients, total sales volume and total revenue")
fig.show()
July and August together account for over 30% of annual revenue.
# Donut chart of each month's share of total revenue.
fig = px.pie(monthly,names="M",values="total_rev",hole=0.5,title="How much revenue does each month account for?")
# Keep months in calendar order (sort=False); label slices outside the ring.
fig.update_traces(textposition='outside',textinfo='percent+label',sort=False)
fig.show()
# Aggregate the three headline metrics by day of the month and plot them.
agg_daily = {
    "total_sales_volume": ("Quantity", "sum"),
    "client_num": ("ticket_number", "count"),
    "total_rev": ("REV", "sum"),
}
daily = df.groupby("D", as_index=False).agg(**agg_daily)
fig = px.line(
    daily,
    x="D",
    y=["client_num", "total_sales_volume", "total_rev"],
    title="Daily number of clients, total sales volume and total revenue",
)
fig.show()
# Aggregate by weekday (1=Mon .. 7=Sun, from the WD column) and plot the rhythm.
agg_weekly = {
    "total_sales_volume": ("Quantity", "sum"),
    "client_num": ("ticket_number", "count"),
    "total_rev": ("REV", "sum"),
}
weekly = df.groupby("WD", as_index=False).agg(**agg_weekly)
fig = px.line(
    weekly,
    x="WD",
    y=["client_num", "total_sales_volume", "total_rev"],
    title="Number of clients, total sales volume and total revenue inside a week",
)
fig.show()
# Extract the hour of day, aggregate the metrics per hour, and plot them.
df["hr"] = df["date_time"].dt.hour
agg_hourly = {
    "total_sales_volume": ("Quantity", "sum"),
    "client_num": ("ticket_number", "count"),
    "total_rev": ("REV", "sum"),
}
hourly = df.groupby("hr", as_index=False).agg(**agg_hourly)
fig = px.line(
    hourly,
    x="hr",
    y=["client_num", "total_sales_volume", "total_rev"],
    title="number of clients, total sales volume and total revenue inside a day",
)
fig.show()
# Pick up the five best-selling articles by total quantity sold.
qty_by_article = df.groupby("article")["Quantity"].sum()
top5_breads = qty_by_article.sort_values(ascending=False).head(5).index
top5_breads
Index(['TRADITIONAL BAGUETTE', 'CROISSANT', 'PAIN AU CHOCOLAT', 'BANETTE',
'COUPE'],
dtype='object', name='article')
# Restrict the data to rows whose article is one of the five best sellers.
top5_br_df = df[df.article.isin(top5_breads)]
# Per-article month-end totals via resample on the timestamp column.
# NOTE(review): the "M" frequency alias is deprecated since pandas 2.2 in
# favour of "ME"; keep "M" only while the environment runs an older pandas.
monthly_br = top5_br_df.groupby("article").resample("M", on="date_time").agg(total_sales_volume=("Quantity","sum"), client_num=("ticket_number","count"),total_rev=("REV","sum")).reset_index()
monthly_br
| article | date_time | total_sales_volume | client_num | total_rev | |
|---|---|---|---|---|---|
| 0 | BANETTE | 2021-01-31 | 649 | 448 | 681.45 |
| 1 | BANETTE | 2021-02-28 | 681 | 454 | 715.05 |
| 2 | BANETTE | 2021-03-31 | 866 | 573 | 909.30 |
| 3 | BANETTE | 2021-04-30 | 1034 | 699 | 1,085.70 |
| 4 | BANETTE | 2021-05-31 | 1180 | 765 | 1,239.00 |
| ... | ... | ... | ... | ... | ... |
| 100 | TRADITIONAL BAGUETTE | 2022-05-31 | 5684 | 3386 | 7,105.00 |
| 101 | TRADITIONAL BAGUETTE | 2022-06-30 | 5010 | 3118 | 6,277.90 |
| 102 | TRADITIONAL BAGUETTE | 2022-07-31 | 10013 | 5573 | 13,016.90 |
| 103 | TRADITIONAL BAGUETTE | 2022-08-31 | 12593 | 6653 | 16,370.90 |
| 104 | TRADITIONAL BAGUETTE | 2022-09-30 | 4395 | 2761 | 5,713.50 |
105 rows × 5 columns
# Stacked bars: each top-5 article's monthly revenue contribution.
fig = px.bar(monthly_br,x="date_time",y="total_rev",color="article",title="Revenue by Month")
fig.show()
# Average units bought per receipt line (volume divided by line count).
monthly_br["avg_purchase_Qty"] = monthly_br["total_sales_volume"]/monthly_br["client_num"]
fig = px.line(monthly_br,x="date_time",y="avg_purchase_Qty",color="article",title="Average Purchase Quantity by Month")
fig.show()
# Per-article day-of-month aggregates for the top sellers.
daily_br = top5_br_df.groupby(["article","D"],as_index=False).agg(
    total_sales_volume=("Quantity", "sum"),
    client_num=("ticket_number", "count"),
    total_rev=("REV", "sum"),
)
# Average units bought per receipt line.
daily_br["avg_purchase_Qty"] = daily_br["total_sales_volume"].div(daily_br["client_num"])
fig = px.line(
    daily_br,
    x="D",
    y="avg_purchase_Qty",
    color="article",
    title="Average Purchase Quantity by Day",
)
fig.show()
# Per-article weekday aggregates for the top sellers.
weekly_br = top5_br_df.groupby(["article","WD"],as_index=False).agg(
    total_sales_volume=("Quantity", "sum"),
    client_num=("ticket_number", "count"),
    total_rev=("REV", "sum"),
)
# Average units bought per receipt line.
weekly_br["avg_purchase_Qty"] = weekly_br["total_sales_volume"].div(weekly_br["client_num"])
fig = px.line(
    weekly_br,
    x="WD",
    y="avg_purchase_Qty",
    color="article",
    title="Average Purchase Quantity inside a Week",
)
fig.show()
# Per-article hour-of-day aggregates for the top sellers.
hourly_br = top5_br_df.groupby(["article","hr"],as_index=False).agg(
    total_sales_volume=("Quantity", "sum"),
    client_num=("ticket_number", "count"),
    total_rev=("REV", "sum"),
)
# Average units bought per receipt line.
hourly_br["avg_purchase_Qty"] = hourly_br["total_sales_volume"].div(hourly_br["client_num"])
fig = px.line(
    hourly_br,
    x="hr",
    y="avg_purchase_Qty",
    color="article",
    title="Average Purchase Quantity by Hour",
)
fig.show()
If you have any suggestions, please feel free to leave a comment. And if you find this notebook useful, you can fork or upvote it.